/******************************************************************************
January 13, 2017
Sui-Jade Ho
Dimitrije Ruzic

Input(s): Manufacturing_1977_2009.dta (gzipped), PF_FK_ACF.dta

This .do file has 2 sections.
SECTION 1:
	- Constructs measures of productivity and distortions
	- Trims data
	
******************************************************************************/

/*Start from a clean session*/
clear programs
clear all
set more off
set matsize 11000

/*Directory of the input and output .dta files; file names are appended
  directly to it, so it must end with a slash*/
local data "/FILE PATH GOES HERE/"

/*Copy each run setting from the global of the same name into a local.
  The globals are not set in this file and must exist before it is run.*/
foreach setting in alphaL alphaK sigma fname pftype {
	local `setting' = "${`setting'}"
}

/*Names of the two samples that the trimming loops iterate over*/
local dtypes "ASM CMF"

/*SECTION 1: Constructing productivity and distortion measures
  (set to 1 to run the section)*/
local S1 = 1

/*********************************************************
SECTION 1: 
	- Construct measures of productivity and distortion
	- Trim productivity and absolute distortion by absolute level:
		- Distortion defined as (1+tauL)^(alphaL)*(1+tauK)^(alphaK)
		Round 1:
		- Construct weighted averages within industry-year
		- Eliminate if observation is 5 times or more the industry-year 
			average
		- Eliminate if observation is 3 times or more the industry-year 
			average AND there are 100 or fewer establishments in the 
			industry-year
		Round 2:
		- Reconstruct the industry-year averages
		- Eliminate the top and bottom 1% of establishments within an 
			industry-year if there are more than 100 establishments in 
			that industry-year
		- Within a year, pool all industries with 100 or fewer 
			establishments and eliminate the top and bottom 1% of 
			the pooled sample
	- Trimming in Sequential Order:
		- Round 1 TFPQ
		- Round 1 TAU
		- Round 2 TFPQ
		- Round 2 TAU
		
	- Constructing the Distortion:
	For TFPRratio we need 
	(MRPKbar/MRPK)^(alphaK) = ((1/MRPKbar)/(1/MRPK))^(-alphaK)
	= ((1/taukbar)/(1/tauk))^(-alphaK)
	= ((K/PY_bar)/(K/PY))^(-alphaK)
	
	I am going to construct 
	(K/PY_bar)/(K/PY)=(1/taukbar)/(1/tauk)=tauk*(1/taukbar)
	and use that expression to eliminate the most distorted establishments
		
**********************************************************/
if `S1' == 1 {
/*Decompress and load the establishment panel.
  The shell path is built from the same data-directory local that -use-
  relies on, and is quoted, so both commands always point at one location
  and a directory name containing spaces does not break the shell call.*/
!gunzip "`data'Manufacturing_1977_2009.dta.gz"
use "`data'Manufacturing_1977_2009.dta", clear
*!gzip "`data'Manufacturing_1977_2009.dta"

/*Restrict to the working variables and to years through 2007.
  Rows with a missing year are dropped too, because a missing value
  compares greater than any number.*/
keep et fk_naics02 year lbdnum PY2 Y2_* L K sw R wt
drop if year>2007

	/*Sample correction: assume wt = 1 when wt = 0*/
	replace wt = 1 if wt == 0

/*Construct variables to distinguish ASM and CMF Samples:
  ASM marks rows with et==0; CMF marks every row in a census year.
  Both are 1 or missing (never 0).*/
gen ASM = 1 if et==0
gen CMF = 1 if inlist(year, 1977, 1982, 1987, 1992, 1997, 2002, 2007)

/*Five-year period index, used as a merge key for the
  production-function parameters*/
gen period = 1 if year<=1982
	replace period = 2 if year>1982 & year<=1987
	replace period = 3 if year>1987 & year<=1992
	replace period = 4 if year>1992 & year<=1997
	replace period = 5 if year>1997 & year<=2002
	replace period = 6 if year>2002 & year<=2007
	
	
/*Ten-year block index, also a merge key*/
gen dec = 1 if year<=1987
	replace dec = 2 if year>1987 & year<=1997
	replace dec = 3 if year>1997 & year<=2007

/*Merge in production-function parameters.
  The industry key is written out in full: the abbreviated form only
  resolves through variable-name abbreviation and stops working under
  -set varabbrev off-. keep(master match) discards parameter rows that
  match no establishment (the using-only rows), and nogenerate avoids
  creating the merge indicator that would otherwise have to be dropped.*/
merge m:1 fk_naics02 period dec using "`data'PF_FK_ACF.dta", keep(master match) nogenerate

/*Factor elasticities and the substitution elasticity come from the
  run settings; each local is inserted as an expression*/
gen alphaL = `alphaL'
gen alphaK = `alphaK'
gen sigma = `sigma'
*gen Y2 = Y2_sigma`sigmaY'
gen double Y2 = (PY2)^(sigma/(sigma-1))

/*Remove the merged parameter columns and the merge keys that are no
  longer needed*/
drop *acf_* *_hk_* sigma_* period dec

/*Dealing with potentially censored observations*/
drop if alphaL==. | alphaK==.

/*Calculate physical productivity, TFPQ, once per sample; the value is
  left missing for rows outside that sample*/
foreach d in ASM CMF {
	gen double tfpq_`d' = Y2/((L^alphaL)*(K^alphaK)) if `d'==1
}

/*Trimming data relative to a weighted average: TFPQ and TAU*/
/*Round 1a: Absolute Deviation from Average, TFPQ
  For each sample, flag establishments whose TFPQ is far above the
  revenue-weighted industry-year average. Creates tfpq_trim1_<sample>
  (proportional deviation from the average) and flag_tfpq_<sample>_1.*/


local dtypes "ASM CMF"
sort fk_naics02 year
foreach d of local dtypes {
	/*Revenue weight: ASM rows are scaled by wt, CMF rows are not*/
	if "`d'"=="ASM" {
		local wgt "PY2*wt"
	}
	else {
		local wgt "PY2"
	}

	/*Each establishment's share of its industry-year total*/
	gen temp = `wgt' if `d'==1
	by fk_naics02 year: egen py_weight_total_`d' = total(temp) if `d'==1
	gen py_weight_`d' = (temp)/py_weight_total_`d'

	/*Weighted industry-year average and proportional deviation from it
	  (0 = at the average, 4 = five times the average)*/
	gen temp2 = tfpq_`d'*py_weight_`d'
	by fk_naics02 year: egen tfpq_average_`d' = total(temp2) if `d'==1
	gen tfpq_trim1_`d' = tfpq_`d'/tfpq_average_`d'-1
	drop temp temp2

	/*Number of sample establishments in the industry-year*/
	by fk_naics02 year: egen est_count_`d' = total(`d')

	/*Flag 5 times or more the average in every industry-year, and
	  3 times or more when the industry-year has 100 or fewer
	  establishments. Both bounds are inclusive: with a strict >4 a
	  deviation of exactly 4 was caught by neither rule, and a strict
	  <100 disagreed with the 100-establishment split used in Round 2.
	  The !=. guard is needed because missing compares greater than 4.*/
	gen flag_tfpq_`d'_1 = 1 if tfpq_trim1_`d'>=4 & tfpq_trim1_`d'!=.
		replace flag_tfpq_`d'_1 = 1 if tfpq_trim1_`d'>=2 & tfpq_trim1_`d'<4 & est_count_`d'<=100
	}
drop py_weight* tfpq_ave* est_count*


/*Round 1b: Absolute Deviation from Average, TAU*/
/*Calculate relative distortions*/
local dtypes "ASM CMF"
sort fk_naics02 year
foreach d of local dtypes {
	/*Rows of this sample that were not flagged in Round 1a*/
	local keepobs "`d'==1 & flag_tfpq_`d'_1!=1"

	/*Revenue weight: scaled by wt for ASM, unscaled for CMF*/
	local wexp "PY2"
	if "`d'"=="ASM" local wexp "PY2*wt"

	gen temp = `wexp' if `keepobs'
	by fk_naics02 year: egen py_weight_total = sum(temp) if `keepobs'
	gen py_weight = (temp)/py_weight_total
	drop temp py_weight_total

	/*New tauK and tauL - FOC: the weighted industry-year mean of the
	  input-to-revenue ratio divided by the establishment's own ratio.
	  K is processed before L so tauk1 is created before taul1.*/
	foreach f in K L {
		local lc = lower("`f'")
		gen temp = (`f'/(PY2)) if `keepobs'
		gen temp2 = temp*py_weight
		by fk_naics02 year: egen tau_bar = sum(temp2)
		gen tau`lc'1_`d' = tau_bar/temp
		drop tau_bar temp temp2
	}
	drop py_weight
}

/*Calculate overall distortion*/
foreach d of local dtypes {
	gen TAU1_`d' = (taul1_`d'^alphaL)*(tauk1_`d'^alphaK)
}

/*Round 1b trim: flag establishments whose overall distortion TAU1 is far
  above the revenue-weighted industry-year average, using only rows that
  survived the Round 1a TFPQ trim. Creates TAU_trim1_<sample> and
  flag_TAU_<sample>_1.*/
local dtypes "ASM CMF"
sort fk_naics02 year
foreach d of local dtypes {
	/*Rows of this sample not flagged in Round 1a*/
	local keepobs "`d'==1 & flag_tfpq_`d'_1!=1"

	/*Revenue weight: ASM rows are scaled by wt, CMF rows are not*/
	if "`d'"=="ASM" {
		local wgt "PY2*wt"
	}
	else {
		local wgt "PY2"
	}

	/*Each establishment's share of its industry-year total*/
	gen temp = `wgt' if `keepobs'
	by fk_naics02 year: egen py_weight_total_`d' = total(temp) if `keepobs'
	gen py_weight_`d' = (temp)/py_weight_total_`d'

	/*Weighted industry-year average and proportional deviation from it*/
	gen temp2 = TAU1_`d'*py_weight_`d'
	by fk_naics02 year: egen TAU_average_`d' = total(temp2) if `keepobs'
	gen TAU_trim1_`d' = TAU1_`d'/TAU_average_`d'-1
	drop temp temp2

	/*Number of surviving sample establishments in the industry-year*/
	gen temp = `d' if flag_tfpq_`d'_1!=1
	by fk_naics02 year: egen est_count_`d' = total(temp)
	drop temp

	/*Flag 5 times or more the average in every industry-year, and
	  3 times or more when the industry-year has 100 or fewer
	  establishments. Both bounds are inclusive: with a strict >4 a
	  deviation of exactly 4 was caught by neither rule, and a strict
	  <100 disagreed with the 100-establishment split used in Round 2.
	  The !=. guard is needed because missing compares greater than 4.*/
	gen flag_TAU_`d'_1 = 1 if TAU_trim1_`d'>=4 & TAU_trim1_`d'!=.
		replace flag_TAU_`d'_1 = 1 if TAU_trim1_`d'>=2 & TAU_trim1_`d'<4 & est_count_`d'<=100
	}
drop py_weight* TAU_ave* est_count*

/*Round 2a: Top and Bottom 1%, TFPQ*/
local dtypes "ASM CMF"
foreach d of local dtypes {
	/*Re-sort on every pass: the pooled ranking below leaves the data
	  sorted by year only*/
	sort fk_naics02 year

	/*Rows not flagged in Round 1*/
	local ok "flag_tfpq_`d'_1!=1 & flag_TAU_`d'_1!=1"

	/*Revenue weight: scaled by wt for ASM, unscaled for CMF*/
	local wexp "PY2"
	if "`d'"=="ASM" local wexp "PY2*wt"

	/*Rebuild the weighted industry-year average on the surviving rows*/
	gen temp = `wexp' if `d'==1 & `ok'
	by fk_naics02 year: egen py_weight_total_`d' = sum(temp) if `d'==1 & `ok'
	gen py_weight_`d' = (temp)/py_weight_total_`d'

	gen temp2 = tfpq_`d'*py_weight_`d'
	by fk_naics02 year: egen tfpq_average_`d' = sum(temp2) if `d'==1 & `ok'
	gen tfpq_trim2_`d' = tfpq_`d'/tfpq_average_`d'-1
	drop temp temp2

	/*Surviving establishments per industry-year*/
	gen temp = `d' if `ok'
	by fk_naics02 year: egen est_count_`d' = sum(temp)
	drop temp

	/*Percentile position is in either 1% tail; the <. guard keeps
	  missing percentiles out of the upper tail*/
	local tail "((pct_`d' >=99 & pct_`d' <.) | pct_`d' <=1 )"

	/*Large industry-years (more than 100): rank within the industry-year*/
	by fk_naics02 year: egen rank_`d' = rank(tfpq_trim2_`d'),  field
	by fk_naics02 year: egen sum_`d' = max(rank_`d')
	gen pct_`d' = (rank_`d'/sum_`d')*100
	gen flag_tfpq_`d'_2 = 1 if `tail' & est_count_`d' >100
	drop rank_* sum_* pct_*

	/*Small industry-years (100 or fewer): pool them within the year
	  and rank the pooled deviations*/
	gen temp = tfpq_trim2_`d' if est_count_`d' <= 100
	bys year: egen rank_`d' = rank(temp),  field
	by year: egen sum_`d' = max(rank_`d')
	gen pct_`d' = (rank_`d'/sum_`d')*100
	replace flag_tfpq_`d'_2 = 1 if `tail' & est_count_`d' <=100
	drop rank_* sum_* pct_*
	drop temp
}
drop py_weight* tfpq_ave* est_count*


/*Round 2b: Top and Bottom 1%, TAU*/
/*Calculate relative distortions*/
local dtypes "ASM CMF"
sort fk_naics02 year
foreach d of local dtypes {
	/*Rows of this sample that survived Round 1 and the Round 2a TFPQ trim*/
	local keepobs "`d'==1 & flag_tfpq_`d'_1!=1 & flag_TAU_`d'_1 != 1 & flag_tfpq_`d'_2 != 1"

	/*Revenue weight: scaled by wt for ASM, unscaled for CMF*/
	local wexp "PY2"
	if "`d'"=="ASM" local wexp "PY2*wt"

	gen temp = `wexp' if `keepobs'
	by fk_naics02 year: egen py_weight_total = sum(temp) if `keepobs'
	gen py_weight = (temp)/py_weight_total
	drop temp py_weight_total

	/*New tauK and tauL - FOC: the weighted industry-year mean of the
	  input-to-revenue ratio divided by the establishment's own ratio.
	  K is processed before L so tauk2 is created before taul2.*/
	foreach f in K L {
		local lc = lower("`f'")
		gen temp = (`f'/(PY2)) if `keepobs'
		gen temp2 = temp*py_weight
		by fk_naics02 year: egen tau_bar = sum(temp2)
		gen tau`lc'2_`d' = tau_bar/temp
		drop tau_bar temp temp2
	}
	drop py_weight
}


/*Calculate overall distortion*/
foreach d of local dtypes {
	gen TAU2_`d' = (taul2_`d'^alphaL)*(tauk2_`d'^alphaK)
}

/*Round 2b trim: flag the top and bottom 1% of the TAU2 deviations*/
local dtypes "ASM CMF"
foreach d of local dtypes {
	/*Re-sort on every pass: the pooled ranking below leaves the data
	  sorted by year only*/
	sort fk_naics02 year

	/*Rows that survived Round 1 and the Round 2a TFPQ trim*/
	local ok "flag_tfpq_`d'_1!=1 & flag_TAU_`d'_1!=1 & flag_tfpq_`d'_2!=1"

	/*Revenue weight: scaled by wt for ASM, unscaled for CMF*/
	local wexp "PY2"
	if "`d'"=="ASM" local wexp "PY2*wt"

	/*Weighted industry-year average of TAU2 on the surviving rows*/
	gen temp = `wexp' if `d'==1 & `ok'
	by fk_naics02 year: egen py_weight_total_`d' = sum(temp) if `d'==1 & `ok'
	gen py_weight_`d' = (temp)/py_weight_total_`d'

	gen temp2 = TAU2_`d'*py_weight_`d'
	by fk_naics02 year: egen TAU_average_`d' = sum(temp2) if `d'==1 & `ok'
	gen TAU_trim2_`d' = TAU2_`d'/TAU_average_`d'-1
	drop temp temp2

	/*Surviving establishments per industry-year*/
	gen temp = `d' if `ok'
	by fk_naics02 year: egen est_count_`d' = sum(temp)
	drop temp

	/*Percentile position is in either 1% tail; the <. guard keeps
	  missing percentiles out of the upper tail*/
	local tail "((pct_`d' >=99 & pct_`d' <.) | pct_`d' <=1 )"

	/*Large industry-years (more than 100): rank within the industry-year*/
	by fk_naics02 year: egen rank_`d' = rank(TAU_trim2_`d'),  field
	by fk_naics02 year: egen sum_`d' = max(rank_`d')
	gen pct_`d' = (rank_`d'/sum_`d')*100
	gen flag_TAU_`d'_2 = 1 if `tail' & est_count_`d' >100
	drop rank_* sum_* pct_*

	/*Small industry-years (100 or fewer): pool them within the year
	  and rank the pooled deviations*/
	gen temp = TAU_trim2_`d' if est_count_`d' <= 100
	bys year: egen rank_`d' = rank(temp),  field
	by year: egen sum_`d' = max(rank_`d')
	gen pct_`d' = (rank_`d'/sum_`d')*100
	replace flag_TAU_`d'_2 = 1 if `tail' & est_count_`d' <=100
	drop rank_* sum_* pct_*
	drop temp
}
drop py_weight* TAU_ave* est_count*

/*Combine the four trimming flags into one sample indicator per sample
  and save the result*/
local dtypes "ASM CMF"
sort fk_naics02 year
foreach d of local dtypes {
	/*Flagged in any of the four trimming rounds*/
	gen flag_`d' = 1 if inlist(1, flag_tfpq_`d'_1, flag_TAU_`d'_1, flag_tfpq_`d'_2, flag_TAU_`d'_2)

	/*Untrimmed sample establishments per industry-year*/
	gen temp = 1 if `d'==1 & flag_`d'!=1
	by fk_naics02 year: egen est_count_`d'=sum(temp)

	/*Drop industries that have even a single year with 5 or fewer observations.
	  For CMF only the census years are checked: no row has CMF==1 in
	  the other years, so the count there is always 0.*/
	local thin "est_count_`d'<=5"
	if "`d'"=="CMF" local thin "`thin' & inlist(year, 1977, 1982, 1987, 1992, 1997, 2002, 2007)"
	gen temp2 = 1 if `thin'

	/*temp2 is 1 or missing, so the industry mean is 1 as soon as one
	  year qualifies and missing otherwise*/
	by fk_naics02: egen flag_ind_`d' = mean(temp2)
	drop temp temp2

	gen sample_`d' = 1 if `d'==1 & flag_`d'!=1 & flag_ind_`d'!=1
}

/*Keep the identifiers and sample indicators only*/
keep lbdnum year fk_naics02 ASM CMF sample*
save "`data'Misallocation_Sample_`fname'.dta", replace


}






